trainingdataset <- read.csv("pml-training.csv", stringsAsFactors = F,na.strings = c("","NA","#DIV/0!"))
testingdataset <- read.csv("pml-testing.csv", stringsAsFactors = F,na.strings = c("","NA","#DIV/0!"))
dim(trainingdataset); dim(testingdataset)
## [1] 19622 160
## [1] 20 160
set.seed(101)
trainingdataset_x <- createDataPartition(trainingdataset$classe, p = 0.8, list = F)
dataset_v <- trainingdataset[-trainingdataset_x,]
trainingdataset <- trainingdataset[trainingdataset_x,]
dim(trainingdataset); dim(dataset_v)
## [1] 15699 160
## [1] 3923 160
table(trainingdataset$classe)/nrow(trainingdataset)
##
## A B C D E
## 0.2843493 0.1935155 0.1744060 0.1638958 0.1838334
na_belt <- sapply(select(trainingdataset,names(trainingdataset)[grepl("_belt",names(trainingdataset))]),
function(x) sum(is.na(x)))
na_belt
## roll_belt pitch_belt yaw_belt
## 0 0 0
## total_accel_belt kurtosis_roll_belt kurtosis_picth_belt
## 0 15396 15413
## kurtosis_yaw_belt skewness_roll_belt skewness_roll_belt.1
## 15699 15395 15413
## skewness_yaw_belt max_roll_belt max_picth_belt
## 15699 15388 15388
## max_yaw_belt min_roll_belt min_pitch_belt
## 15396 15388 15388
## min_yaw_belt amplitude_roll_belt amplitude_pitch_belt
## 15396 15388 15388
## amplitude_yaw_belt var_total_accel_belt avg_roll_belt
## 15396 15388 15388
## stddev_roll_belt var_roll_belt avg_pitch_belt
## 15388 15388 15388
## stddev_pitch_belt var_pitch_belt avg_yaw_belt
## 15388 15388 15388
## stddev_yaw_belt var_yaw_belt gyros_belt_x
## 15388 15388 0
## gyros_belt_y gyros_belt_z accel_belt_x
## 0 0 0
## accel_belt_y accel_belt_z magnet_belt_x
## 0 0 0
## magnet_belt_y magnet_belt_z
## 0 0
na_arm <- sapply(select(trainingdataset,names(trainingdataset)[grepl("_arm",names(trainingdataset))]),
function(x) sum(is.na(x)))
na_arm
## roll_arm pitch_arm yaw_arm total_accel_arm
## 0 0 0 0
## var_accel_arm avg_roll_arm stddev_roll_arm var_roll_arm
## 15388 15388 15388 15388
## avg_pitch_arm stddev_pitch_arm var_pitch_arm avg_yaw_arm
## 15388 15388 15388 15388
## stddev_yaw_arm var_yaw_arm gyros_arm_x gyros_arm_y
## 15388 15388 0 0
## gyros_arm_z accel_arm_x accel_arm_y accel_arm_z
## 0 0 0 0
## magnet_arm_x magnet_arm_y magnet_arm_z kurtosis_roll_arm
## 0 0 0 15446
## kurtosis_picth_arm kurtosis_yaw_arm skewness_roll_arm skewness_pitch_arm
## 15448 15398 15445 15448
## skewness_yaw_arm max_roll_arm max_picth_arm max_yaw_arm
## 15398 15388 15388 15388
## min_roll_arm min_pitch_arm min_yaw_arm amplitude_roll_arm
## 15388 15388 15388 15388
## amplitude_pitch_arm amplitude_yaw_arm
## 15388 15388
na_fore <- sapply(select(trainingdataset,
names(trainingdataset)[grepl("_forearm",names(trainingdataset))]),
function(x) sum(is.na(x)))
na_fore
## roll_forearm pitch_forearm yaw_forearm
## 0 0 0
## kurtosis_roll_forearm kurtosis_picth_forearm kurtosis_yaw_forearm
## 15448 15449 15699
## skewness_roll_forearm skewness_pitch_forearm skewness_yaw_forearm
## 15447 15449 15699
## max_roll_forearm max_picth_forearm max_yaw_forearm
## 15388 15388 15448
## min_roll_forearm min_pitch_forearm min_yaw_forearm
## 15388 15388 15448
## amplitude_roll_forearm amplitude_pitch_forearm amplitude_yaw_forearm
## 15388 15388 15448
## total_accel_forearm var_accel_forearm avg_roll_forearm
## 0 15388 15388
## stddev_roll_forearm var_roll_forearm avg_pitch_forearm
## 15388 15388 15388
## stddev_pitch_forearm var_pitch_forearm avg_yaw_forearm
## 15388 15388 15388
## stddev_yaw_forearm var_yaw_forearm gyros_forearm_x
## 15388 15388 0
## gyros_forearm_y gyros_forearm_z accel_forearm_x
## 0 0 0
## accel_forearm_y accel_forearm_z magnet_forearm_x
## 0 0 0
## magnet_forearm_y magnet_forearm_z
## 0 0
na_bel <- sapply(select(trainingdataset,
names(trainingdataset)[grepl("_dumbbell",names(trainingdataset))]),
function(x) sum(is.na(x)))
na_bel
## roll_dumbbell pitch_dumbbell yaw_dumbbell
## 0 0 0
## kurtosis_roll_dumbbell kurtosis_picth_dumbbell kurtosis_yaw_dumbbell
## 15392 15390 15699
## skewness_roll_dumbbell skewness_pitch_dumbbell skewness_yaw_dumbbell
## 15391 15389 15699
## max_roll_dumbbell max_picth_dumbbell max_yaw_dumbbell
## 15388 15388 15392
## min_roll_dumbbell min_pitch_dumbbell min_yaw_dumbbell
## 15388 15388 15392
## amplitude_roll_dumbbell amplitude_pitch_dumbbell amplitude_yaw_dumbbell
## 15388 15388 15392
## total_accel_dumbbell var_accel_dumbbell avg_roll_dumbbell
## 0 15388 15388
## stddev_roll_dumbbell var_roll_dumbbell avg_pitch_dumbbell
## 15388 15388 15388
## stddev_pitch_dumbbell var_pitch_dumbbell avg_yaw_dumbbell
## 15388 15388 15388
## stddev_yaw_dumbbell var_yaw_dumbbell gyros_dumbbell_x
## 15388 15388 0
## gyros_dumbbell_y gyros_dumbbell_z accel_dumbbell_x
## 0 0 0
## accel_dumbbell_y accel_dumbbell_z magnet_dumbbell_x
## 0 0 0
## magnet_dumbbell_y magnet_dumbbell_z
## 0 0
dropped_column2 <- c(names(na_belt[na_belt != 0]),
names(na_arm[na_arm != 0]),
names(na_fore[na_fore != 0]),
names(na_bel[na_bel != 0]))
length(dropped_column2)
## [1] 100
dataframe_analysis <- tbl_df(trainingdataset %>%
select(-dropped_column2,
-c(X,user_name, raw_timestamp_part_1,
raw_timestamp_part_2, cvtd_timestamp,
new_window,num_window)))
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(dropped_column2)` instead of `dropped_column2` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
dataframe_analysis$classe <- as.factor(dataframe_analysis$classe)
dataframe_analysis[,1:52] <- lapply(dataframe_analysis[,1:52],as.numeric)
dim(dataframe_analysis)
## [1] 15699 53
correlation_columns<- cor(select(dataframe_analysis, -classe))
diag(correlation_columns) <- 0
correlation_columns<- which(abs(correlation_columns)>0.8,arr.ind = T)
correlation_columns<- unique(row.names(correlation_columns))
corrplot(cor(select(dataframe_analysis,correlation_columns)),
type="upper", order="hclust",method = "number")
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(correlation_columns)` instead of `correlation_columns` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

correlation_dataframe_1 <- dataframe_analysis %>% binarize(n_bins = 4, thresh_infreq = 0.01)
correlation_x <- correlation_dataframe_1 %>% correlate(target = classe__A)
correlation_x %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
correlation_y<- correlation_dataframe_1 %>% correlate(target = classe__B)
correlation_y%>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
correlation_z <- correlation_dataframe_1 %>% correlate(target = classe__C)
correlation_z %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
correlation_u <- correlation_dataframe_1 %>% correlate(target = classe__D)
correlation_u %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
correlation_v <- correlation_dataframe_1 %>% correlate(target = classe__E)
correlation_v %>% plot_correlation_funnel(interactive = T,limits = c(-0.5,0.5))
column_z <- c("magnet_arm_x", "pitch_forearm" , "magnet_dumbbell_y",
"roll_forearm", "gyros_dumbbell_y")
column_y <- c("magnet_dumbbell_y", "magnet_dumbbell_x" , "roll_dumbbell" ,
"magnet_belt_y" , "accel_dumbbell_x" )
column_x <- c("magnet_dumbbell_y", "roll_dumbbell" , "accel_dumbbell_y" ,
"magnet_dumbbell_x", "magnet_dumbbell_z")
column_w <- c("pitch_forearm" , "magnet_arm_y" , "magnet_forearm_x",
"accel_dumbbell_y", "accel_forearm_x")
column_v <- c("magnet_belt_y" , "magnet_belt_z" , "roll_belt",
"gyros_belt_z" , "magnet_dumbbell_y")
column_finals <- character()
for(c in c(column_z,column_y,column_x,column_w,column_v)){
column_finals <- union(column_finals, c)
}
dataframe_analysis2 <- dataframe_analysis %>% select(column_finals, classe)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(column_finals)` instead of `column_finals` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
data.frame("arm" = sum(grepl("_arm",column_finals)),
"forearm" = sum(grepl("_forearm",column_finals)),
"belt" = sum(grepl("_belt",column_finals)),
"dumbbell" = sum(grepl("_dumbbell",column_finals)))
## arm forearm belt dumbbell
## 1 2 4 4 7
density_plot <- function(data, mapping, ...) {
ggplot(data = data, mapping=mapping) +
geom_density(..., alpha = 0.3)+scale_fill_brewer(palette="Set2")
}
point_plot <- function(data, mapping, ...) {
ggplot(data = data, mapping=mapping) +
geom_point(..., alpha = 0.1)+ scale_fill_brewer(palette="Set2")
}
ggpairs(dataframe_analysis2, columns = 1:5,aes(color = classe),
lower = list(continuous = point_plot),diag = list(continuous = density_plot))

ggpairs(dataframe_analysis2, columns = 6:10,aes(color = classe),
lower = list(continuous = point_plot),diag = list(continuous = density_plot))

ggpairs(dataframe_analysis2, columns = 11:17,aes(color = classe),
lower = list(continuous = point_plot),diag = list(continuous = density_plot))

trainingdatasetF <- trainingdataset %>% select(column_finals,classe)
dataset_vF <- dataset_v %>% select(column_finals,classe)
trainingdatasetF[,1:17] <- sapply(trainingdatasetF[,1:17],as.numeric)
dataset_vF[,1:17] <- sapply(dataset_vF[,1:17],as.numeric)
levels <- c("A", "B", "C", "D", "E")
before_processing <- preProcess(trainingdatasetF[,-18],method = c("center","scale","BoxCox"))
xTrain <- predict(before_processing,select(trainingdatasetF,-classe))
yTrain <- factor(trainingdatasetF$classe,levels=levels)
xVal <- predict(before_processing,select(dataset_vF,-classe))
yVal <- factor(dataset_vF$classe,levels=levels)
trControl <- trainControl(method="cv", number=5)
modelCT <- train(x = xTrain,y = yTrain,
method = "rpart", trControl = trControl)
modelRF <- train(x = xTrain,y = yTrain,
method = "rf", trControl = trControl,verbose=FALSE, metric = "Accuracy")
modelGBM <- train(x = xTrain,y = yTrain,
method = "gbm",trControl=trControl, verbose=FALSE)
modelSVM <- svm(x = xTrain,y = yTrain,
kernel = "polynomial", cost = 10)
confusionMatrix(predict(modelCT,xVal),yVal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1003 330 319 294 106
## B 19 256 20 109 103
## C 93 173 345 240 212
## D 0 0 0 0 0
## E 1 0 0 0 300
##
## Overall Statistics
##
## Accuracy : 0.4853
## 95% CI : (0.4696, 0.5011)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3271
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.8987 0.33729 0.50439 0.0000 0.41609
## Specificity 0.6263 0.92067 0.77833 1.0000 0.99969
## Pos Pred Value 0.4888 0.50493 0.32455 NaN 0.99668
## Neg Pred Value 0.9396 0.85275 0.88147 0.8361 0.88377
## Prevalence 0.2845 0.19347 0.17436 0.1639 0.18379
## Detection Rate 0.2557 0.06526 0.08794 0.0000 0.07647
## Detection Prevalence 0.5231 0.12924 0.27097 0.0000 0.07673
## Balanced Accuracy 0.7625 0.62898 0.64136 0.5000 0.70789
confusionMatrix(predict(modelRF,xVal),yVal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1112 7 0 0 0
## B 3 741 5 3 1
## C 1 7 676 15 4
## D 0 4 3 625 1
## E 0 0 0 0 715
##
## Overall Statistics
##
## Accuracy : 0.9862
## 95% CI : (0.9821, 0.9896)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9826
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9964 0.9763 0.9883 0.9720 0.9917
## Specificity 0.9975 0.9962 0.9917 0.9976 1.0000
## Pos Pred Value 0.9937 0.9841 0.9616 0.9874 1.0000
## Neg Pred Value 0.9986 0.9943 0.9975 0.9945 0.9981
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2835 0.1889 0.1723 0.1593 0.1823
## Detection Prevalence 0.2852 0.1919 0.1792 0.1614 0.1823
## Balanced Accuracy 0.9970 0.9862 0.9900 0.9848 0.9958
plot(modelRF$finalModel,main="Error VS no of tree")

confusionMatrix(predict(modelGBM,xVal),yVal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1083 40 2 5 3
## B 18 642 32 16 11
## C 8 54 635 39 10
## D 4 21 14 582 9
## E 3 2 1 1 688
##
## Overall Statistics
##
## Accuracy : 0.9253
## 95% CI : (0.9166, 0.9333)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9055
##
## Mcnemar's Test P-Value : 2.509e-07
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9704 0.8458 0.9284 0.9051 0.9542
## Specificity 0.9822 0.9757 0.9657 0.9854 0.9978
## Pos Pred Value 0.9559 0.8929 0.8512 0.9238 0.9899
## Neg Pred Value 0.9882 0.9635 0.9846 0.9815 0.9898
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2761 0.1637 0.1619 0.1484 0.1754
## Detection Prevalence 0.2888 0.1833 0.1902 0.1606 0.1772
## Balanced Accuracy 0.9763 0.9108 0.9470 0.9452 0.9760
confusionMatrix(predict(modelSVM,xVal),yVal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1096 40 18 17 2
## B 1 676 15 5 6
## C 9 40 640 45 3
## D 10 3 9 575 9
## E 0 0 2 1 701
##
## Overall Statistics
##
## Accuracy : 0.9401
## 95% CI : (0.9322, 0.9473)
## No Information Rate : 0.2845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9241
##
## Mcnemar's Test P-Value : 1.808e-15
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9821 0.8906 0.9357 0.8942 0.9723
## Specificity 0.9726 0.9915 0.9701 0.9905 0.9991
## Pos Pred Value 0.9344 0.9616 0.8684 0.9488 0.9957
## Neg Pred Value 0.9927 0.9742 0.9862 0.9795 0.9938
## Prevalence 0.2845 0.1935 0.1744 0.1639 0.1838
## Detection Rate 0.2794 0.1723 0.1631 0.1466 0.1787
## Detection Prevalence 0.2990 0.1792 0.1879 0.1545 0.1795
## Balanced Accuracy 0.9773 0.9411 0.9529 0.9424 0.9857
testingdataset2 <- testingdataset %>% select(column_finals,problem_id)
xTest <- testingdataset2 %>% select(column_finals)
result <- data.frame("problem_id" = testingdataset$problem_id,
"PREDICTION_RF" = predict(modelRF,xTest),
"PREDICTION_GBM" = predict(modelGBM,xTest),
"PREDICTION_SVM" = predict(modelSVM,xTest))
result
## problem_id PREDICTION_RF PREDICTION_GBM PREDICTION_SVM
## 1 1 E E C
## 2 2 A E A
## 3 3 A D B
## 4 4 E E C
## 5 5 E E A
## 6 6 E D C
## 7 7 E E B
## 8 8 B D A
## 9 9 A B E
## 10 10 E E E
## 11 11 A E C
## 12 12 A D C
## 13 13 E B E
## 14 14 A D B
## 15 15 E E B
## 16 16 E E A
## 17 17 E E C
## 18 18 B E A
## 19 19 E E A
## 20 20 E E E
trainingdatasetF2 <- tbl_df(trainingdataset %>%
select(-dropped_column2,
-c(X,user_name, raw_timestamp_part_1,
raw_timestamp_part_2, cvtd_timestamp,
new_window,num_window)))
xTrain2 <- trainingdatasetF2 %>% select(-classe)
xTrain2 <- sapply(xTrain2,as.numeric)
yTrain2 <- factor(trainingdatasetF2$classe,levels=levels)
dataset_vF2 <- tbl_df(dataset_v %>%
select(-dropped_column2,
-c(X,user_name, raw_timestamp_part_1,
raw_timestamp_part_2, cvtd_timestamp,
new_window,num_window)))
xVal2 <- dataset_vF2 %>% select(-classe)
xVal2 <- sapply(xVal2,as.numeric)
yVal2 <- factor(dataset_vF2$classe,levels=levels)
testingdatasetF2 <- tbl_df(testingdataset %>%
select(-dropped_column2,
-c(X,user_name, raw_timestamp_part_1,
raw_timestamp_part_2, cvtd_timestamp,
new_window,num_window)))
xTest2 <- testingdatasetF2 %>% select(-problem_id)
xTest2 <- sapply(xTest2,as.numeric)
pb_id <- dataset_vF2$classe
library(doParallel)
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.0.3
## Loading required package: iterators
## Warning: package 'iterators' was built under R version 4.0.3
## Loading required package: parallel
ncores <- makeCluster(detectCores() - 1)
registerDoParallel(cores=ncores)
getDoParWorkers()
## [1] 3
modelRF2 <- train(x = xTrain2,y = yTrain2, method = "rf",
metric = "Accuracy",
trControl=trainControl(method = "cv", number = 4,
p= 0.60, allowParallel = TRUE ))
result2 <- data.frame("problem_id" = testingdataset$problem_id,
"PREDICTION_RF" = predict(modelRF,xTest),
"PREDICTION_GBM" = predict(modelGBM,xTest),
"PREDICTION_SVM" = predict(modelSVM,xTest),
"PREDICTION_RF2_ALL_COL"=predict(modelRF2,xTest2))
result2
## problem_id PREDICTION_RF PREDICTION_GBM PREDICTION_SVM
## 1 1 E E C
## 2 2 A E A
## 3 3 A D B
## 4 4 E E C
## 5 5 E E A
## 6 6 E D C
## 7 7 E E B
## 8 8 B D A
## 9 9 A B E
## 10 10 E E E
## 11 11 A E C
## 12 12 A D C
## 13 13 E B E
## 14 14 A D B
## 15 15 E E B
## 16 16 E E A
## 17 17 E E C
## 18 18 B E A
## 19 19 E E A
## 20 20 E E E
## PREDICTION_RF2_ALL_COL
## 1 B
## 2 A
## 3 B
## 4 A
## 5 A
## 6 E
## 7 D
## 8 B
## 9 A
## 10 A
## 11 B
## 12 C
## 13 B
## 14 A
## 15 E
## 16 E
## 17 A
## 18 B
## 19 B
## 20 B